1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(TH.data)
library(psych)
library(whitening)
library("vioplot")
library("rpart")
library(mlbench)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Source W. Nick Street, Olvi L. Mangasarian and William H. Wolberg (1995). An inductive learning approach to prognostic prediction. In A. Prieditis and S. Russell, editors, Proceedings of the Twelfth International Conference on Machine Learning, pages 522–530, San Francisco, Morgan Kaufmann.

Peter Buehlmann and Torsten Hothorn (2007), Boosting algorithms: regularization, prediction and model fitting. Statistical Science, 22(4), 477–505.

1.2 The Data

wpbc {TH.data}


data("wpbc", package = "TH.data")
table(wpbc[,"status"])
#> 
#>   N   R 
#> 151  47
sum(1*(wpbc[,"status"]=="R" &  wpbc$time <= 24))
#> [1] 29
wpbc <- subset(wpbc,time > 36 | status=="R" )
summary(wpbc$time)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>    1.00   36.75   60.50   58.79   78.75  125.00
wpbc[,"status"] <- 1*(wpbc[,"status"]=="R")
wpbc <- wpbc[complete.cases(wpbc),]
pander::pander(table(wpbc[,"status"]))
0 1
91 46
wpbc$time <- NULL

1.2.0.1 Standarize the names for the reporting

studyName <- "Wisconsin"
dataframe <- wpbc
outcome <- "status"
thro <- 0.4
TopVariables <- 10
cexheat = 0.25

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
137 32
pander::pander(table(dataframe[,outcome]))
0 1
91 46

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9961379

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 32 , Uni p: 0.0233331 , Uncorrelated Base: 4 , Outcome-Driven Size: 0 , Base Size: 4 
#> 
#> 
 1 <R=0.996,r=0.973,N=    6>, Top: 2( 2 )[ 1 : 2 Fa= 2 : 0.973 ]( 2 , 4 , 0 ),<|>Tot Used: 6 , Added: 4 , Zero Std: 0 , Max Cor: 0.968
#> 
 2 <R=0.968,r=0.959,N=    6>, Top: 1( 1 )[ 1 : 1 Fa= 3 : 0.959 ]( 1 , 1 , 2 ),<|>Tot Used: 8 , Added: 1 , Zero Std: 0 , Max Cor: 0.911
#> 
 3 <R=0.911,r=0.905,N=    4>, Top: 2( 1 )[ 1 : 2 Fa= 3 : 0.905 ]( 2 , 2 , 3 ),<|>Tot Used: 9 , Added: 2 , Zero Std: 0 , Max Cor: 0.895
#> 
 4 <R=0.895,r=0.847,N=   11>, Top: 6( 1 )[ 1 : 6 Fa= 8 : 0.847 ]( 5 , 5 , 3 ),<|>Tot Used: 18 , Added: 5 , Zero Std: 0 , Max Cor: 0.842
#> 
 5 <R=0.842,r=0.821,N=   11>, Top: 3( 1 )[ 1 : 3 Fa= 8 : 0.821 ]( 3 , 3 , 8 ),<|>Tot Used: 20 , Added: 3 , Zero Std: 0 , Max Cor: 0.796
#> 
 6 <R=0.796,r=0.598,N=   11>, Top: 5( 3 )[ 1 : 5 Fa= 12 : 0.598 ]( 5 , 9 , 8 ),<|>Tot Used: 23 , Added: 9 , Zero Std: 0 , Max Cor: 0.785
#> 
 7 <R=0.785,r=0.592,N=   11>, Top: 4( 1 )[ 1 : 4 Fa= 14 : 0.592 ]( 4 , 6 , 12 ),<|>Tot Used: 26 , Added: 6 , Zero Std: 0 , Max Cor: 0.739
#> 
 8 <R=0.739,r=0.570,N=   11>, Top: 3( 2 )[ 1 : 3 Fa= 14 : 0.570 ]( 3 , 4 , 14 ),<|>Tot Used: 28 , Added: 4 , Zero Std: 0 , Max Cor: 0.560
#> 
 9 <R=0.560,r=0.480,N=   11>, Top: 9[ 2 ]( 1 )=[ 2 : 9 Fa= 16 : 0.531 ]( 8 , 11 , 14 ),<|>Tot Used: 32 , Added: 11 , Zero Std: 0 , Max Cor: 0.582
#> 
 10 <R=0.582,r=0.491,N=   11>, Top: 4( 1 )[ 1 : 4 Fa= 17 : 0.491 ]( 4 , 4 , 16 ),<|>Tot Used: 32 , Added: 4 , Zero Std: 0 , Max Cor: 0.765
#> 
 11 <R=0.765,r=0.583,N=   11>, Top: 2( 1 )[ 1 : 2 Fa= 18 : 0.583 ]( 2 , 2 , 17 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.522
#> 
 12 <R=0.522,r=0.461,N=    8>, Top: 3( 3 )[ 1 : 3 Fa= 18 : 0.461 ]( 2 , 5 , 18 ),<|>Tot Used: 32 , Added: 5 , Zero Std: 0 , Max Cor: 0.554
#> 
 13 <R=0.554,r=0.477,N=    8>, Top: 1( 1 )[ 1 : 1 Fa= 18 : 0.477 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.454
#> 
 14 <R=0.454,r=0.427,N=   11>, Top: 5( 1 )[ 1 : 5 Fa= 20 : 0.427 ]( 5 , 6 , 18 ),<|>Tot Used: 32 , Added: 6 , Zero Std: 0 , Max Cor: 0.579
#> 
 15 <R=0.579,r=0.540,N=   11>, Top: 1( 1 )[ 1 : 1 Fa= 20 : 0.540 ]( 1 , 1 , 20 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.422
#> 
 16 <R=0.422,r=0.400,N=    6>, Top: 3( 1 )[ 1 : 3 Fa= 21 : 0.400 ]( 3 , 3 , 20 ),<|>Tot Used: 32 , Added: 3 , Zero Std: 0 , Max Cor: 0.478
#> 
 17 <R=0.478,r=0.400,N=    6>, Top: 1( 1 )[ 1 : 1 Fa= 21 : 0.400 ]( 1 , 1 , 21 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.390
#> 
 18 <R=0.390,r=0.400,N=    0>
#> 
 [ 18 ], 0.3902384 Decor Dimension: 32 Nused: 32 . Cor to Base: 21 , ABase: 4 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

515156

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

4916

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

1.39

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

1.37

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.3902384

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")



univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
tsize 3.47 2.03 2.64 1.86 1.11e-03 0.666
pnodes 4.87 6.02 2.63 5.21 6.25e-09 0.650
worst_radius 22.67 4.70 20.35 4.08 3.68e-01 0.647
worst_perimeter 151.33 32.42 135.34 26.85 5.71e-01 0.645
mean_area 1081.98 397.26 888.40 310.85 1.26e-01 0.645
worst_area 1635.77 703.15 1317.95 550.94 2.72e-01 0.643
mean_perimeter 121.10 22.91 110.02 19.19 4.72e-01 0.641
mean_radius 18.33 3.37 16.70 2.91 3.12e-01 0.639
SE_perimeter 4.73 2.21 3.81 1.80 6.37e-02 0.634
SE_area 81.97 53.36 61.22 37.72 6.46e-02 0.632


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
tsize 3.47174 2.02985 2.63846 1.85507 0.00111 0.666
La_worst_area 228.53323 47.29391 250.81270 52.75337 0.15315 0.640
mean_radius 18.33087 3.36557 16.69945 2.91309 0.31204 0.639
La_mean_perimeter -5.45575 0.60054 -5.79148 0.74945 0.05357 0.633
La_SE_concavepoints 0.01479 0.00242 0.01568 0.00324 0.84498 0.599
La_worst_concavity 0.27693 0.04829 0.25813 0.05620 0.79863 0.597
La_SE_area -63.19916 13.30716 -61.29910 9.41088 0.98175 0.587
La_SE_smoothness 0.00489 0.00188 0.00550 0.00192 0.38636 0.573
La_mean_symmetry 0.15714 0.01905 0.16337 0.02694 0.18185 0.571
La_SE_symmetry -0.00325 0.00362 -0.00377 0.00526 0.01253 0.568

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
4.54 28 0.875


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
tsize 3.47e+00 2.03e+00 2.64e+00 1.86e+00 0.001110 0.666 0.666 1
worst_radius NA 2.27e+01 4.70e+00 2.03e+01 4.08e+00 0.368015 0.647 0.647 NA
worst_perimeter NA 1.51e+02 3.24e+01 1.35e+02 2.68e+01 0.570824 0.645 0.645 NA
mean_area NA 1.08e+03 3.97e+02 8.88e+02 3.11e+02 0.125673 0.645 0.645 NA
worst_area NA 1.64e+03 7.03e+02 1.32e+03 5.51e+02 0.271758 0.643 0.643 NA
mean_perimeter NA 1.21e+02 2.29e+01 1.10e+02 1.92e+01 0.471679 0.641 0.641 NA
La_worst_area + 178.320mean_radius -1.675mean_area + 42.622SE_perimeter -2.435SE_area -101.151worst_radius -3.788worst_perimeter + 1.000*worst_area 2.29e+02 4.73e+01 2.51e+02 5.28e+01 0.153154 0.640 0.643 -4
mean_radius 1.83e+01 3.37e+00 1.67e+01 2.91e+00 0.312037 0.639 0.639 10
SE_perimeter NA 4.73e+00 2.21e+00 3.81e+00 1.80e+00 0.063724 0.634 0.634 NA
La_mean_perimeter -6.657mean_radius + 1.000mean_perimeter -33.734mean_compactness + 0.581worst_radius -0.085*worst_perimeter -5.46e+00 6.01e-01 -5.79e+00 7.49e-01 0.053570 0.633 0.641 -3
SE_area NA 8.20e+01 5.34e+01 6.12e+01 3.77e+01 0.064638 0.632 0.632 NA
La_SE_concavepoints + 0.000mean_radius -0.001SE_perimeter -0.278SE_compactness + 1.000SE_concavepoints + 0.022*worst_compactness 1.48e-02 2.42e-03 1.57e-02 3.24e-03 0.844985 0.599 0.466 -1
La_worst_concavity + 0.010mean_radius + 1.570mean_compactness -1.480mean_concavity + 2.829SE_compactness -3.192SE_concavity -0.830worst_compactness + 1.000*worst_concavity 2.77e-01 4.83e-02 2.58e-01 5.62e-02 0.798630 0.597 0.492 -3
worst_symmetry NA 3.14e-01 6.15e-02 3.39e-01 8.23e-02 0.069522 0.596 0.596 NA
La_SE_area + 3.076mean_radius -17.504SE_perimeter + 1.000SE_area -14.926worst_radius + 1.452*worst_perimeter -6.32e+01 1.33e+01 -6.13e+01 9.41e+00 0.981753 0.587 0.632 -3
mean_symmetry NA 1.88e-01 2.11e-02 1.97e-01 3.12e-02 0.151020 0.580 0.580 NA
La_SE_smoothness + 1.000SE_smoothness -0.052SE_compactness -0.103SE_concavity + 0.011worst_compactness 4.89e-03 1.88e-03 5.50e-03 1.92e-03 0.386356 0.573 0.536 -1
La_mean_symmetry + 1.000mean_symmetry -0.086worst_compactness 1.57e-01 1.91e-02 1.63e-01 2.69e-02 0.181846 0.571 0.580 1
La_SE_symmetry + 0.044mean_symmetry -0.404SE_compactness + 1.000SE_symmetry + 0.048worst_compactness -0.115*worst_symmetry -3.25e-03 3.62e-03 -3.77e-03 5.26e-03 0.012531 0.568 0.504 -3
mean_concavity NA 1.62e-01 6.21e-02 1.51e-01 6.33e-02 0.499349 0.547 0.547 NA
worst_compactness NA 3.58e-01 1.31e-01 3.87e-01 1.79e-01 0.113263 0.538 0.538 NA
SE_smoothness NA 6.48e-03 2.00e-03 6.86e-03 3.30e-03 0.022967 0.536 0.536 NA
SE_compactness NA 3.08e-02 1.75e-02 3.13e-02 1.88e-02 0.021169 0.513 0.513 NA
mean_compactness NA 1.42e-01 4.07e-02 1.46e-01 5.30e-02 0.340243 0.505 0.505 NA
SE_symmetry NA 1.99e-02 9.17e-03 2.06e-02 1.08e-02 0.000118 0.504 0.504 NA
worst_concavity NA 4.40e-01 1.48e-01 4.42e-01 1.70e-01 0.238928 0.492 0.492 NA
SE_concavity NA 3.83e-02 1.61e-02 3.90e-02 2.12e-02 0.046100 0.478 0.478 NA
SE_concavepoints NA 1.44e-02 3.94e-03 1.45e-02 5.40e-03 0.107204 0.466 0.466 NA

1.10 Comparing IDeA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 68 23
1 9 37
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.438 0.3534 0.525
    tp 0.336 0.2574 0.421
    se 0.804 0.6609 0.906
    sp 0.747 0.6453 0.833
    diag.ac 0.766 0.6866 0.834
    diag.or 12.155 5.1002 28.966
    nndx 1.813 1.3532 3.267
    youden 0.552 0.3061 0.739
    pv.pos 0.617 0.4821 0.739
    pv.neg 0.883 0.7897 0.945
    lr.pos 3.182 2.1743 4.658
    lr.neg 0.262 0.1440 0.476
    p.rout 0.562 0.4747 0.647
    p.rin 0.438 0.3534 0.525
    p.tpdn 0.253 0.1675 0.355
    p.tndp 0.196 0.0936 0.339
    p.dntp 0.383 0.2607 0.518
    p.dptn 0.117 0.0549 0.210
  • tab:

      Outcome + Outcome - Total
    Test + 37 23 60
    Test - 9 68 77
    Total 46 91 137
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.766 0.687 0.834
3 se 0.804 0.661 0.906
4 sp 0.747 0.645 0.833
6 diag.or 12.155 5.100 28.966

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 85 6
1 24 22
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.2044 0.1403 0.282
    tp 0.3358 0.2574 0.421
    se 0.4783 0.3289 0.631
    sp 0.9341 0.8620 0.975
    diag.ac 0.7810 0.7024 0.847
    diag.or 12.9861 4.7298 35.655
    nndx 2.4253 1.6503 5.239
    youden 0.4123 0.1909 0.606
    pv.pos 0.7857 0.5905 0.917
    pv.neg 0.7798 0.6903 0.854
    lr.pos 7.2536 3.1625 16.637
    lr.neg 0.5586 0.4213 0.741
    p.rout 0.7956 0.7183 0.860
    p.rin 0.2044 0.1403 0.282
    p.tpdn 0.0659 0.0246 0.138
    p.tndp 0.5217 0.3695 0.671
    p.dntp 0.2143 0.0830 0.410
    p.dptn 0.2202 0.1465 0.310
  • tab:

      Outcome + Outcome - Total
    Test + 22 6 28
    Test - 24 85 109
    Total 46 91 137
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.781 0.702 0.847
3 se 0.478 0.329 0.631
4 sp 0.934 0.862 0.975
6 diag.or 12.986 4.730 35.655

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 86 5
1 27 19
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.1752 0.1156 0.249
    tp 0.3358 0.2574 0.421
    se 0.4130 0.2700 0.568
    sp 0.9451 0.8764 0.982
    diag.ac 0.7664 0.6866 0.834
    diag.or 12.1037 4.1275 35.493
    nndx 2.7925 1.8193 6.831
    youden 0.3581 0.1464 0.550
    pv.pos 0.7917 0.5785 0.929
    pv.neg 0.7611 0.6717 0.836
    lr.pos 7.5174 2.9985 18.846
    lr.neg 0.6211 0.4849 0.795
    p.rout 0.8248 0.7506 0.884
    p.rin 0.1752 0.1156 0.249
    p.tpdn 0.0549 0.0181 0.124
    p.tndp 0.5870 0.4323 0.730
    p.dntp 0.2083 0.0713 0.422
    p.dptn 0.2389 0.1637 0.328
  • tab:

      Outcome + Outcome - Total
    Test + 19 5 24
    Test - 27 86 113
    Total 46 91 137
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.766 0.687 0.834
3 se 0.413 0.270 0.568
4 sp 0.945 0.876 0.982
6 diag.or 12.104 4.128 35.493


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 90 1
1 38 8
  pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.0657 0.030477 0.1210
    tp 0.3358 0.257400 0.4214
    se 0.1739 0.078203 0.3142
    sp 0.9890 0.940289 0.9997
    diag.ac 0.7153 0.631995 0.7891
    diag.or 18.9474 2.289909 156.7760
    nndx 6.1378 3.185601 54.0757
    youden 0.1629 0.018493 0.3139
    pv.pos 0.8889 0.517503 0.9972
    pv.neg 0.7031 0.615994 0.7806
    lr.pos 15.8261 2.040645 122.7382
    lr.neg 0.8353 0.730259 0.9554
    p.rout 0.9343 0.878957 0.9695
    p.rin 0.0657 0.030477 0.1210
    p.tpdn 0.0110 0.000278 0.0597
    p.tndp 0.8261 0.685809 0.9218
    p.dntp 0.1111 0.002809 0.4825
    p.dptn 0.2969 0.219402 0.3840
  • tab:

      Outcome + Outcome - Total
    Test + 8 1 9
    Test - 38 90 128
    Total 46 91 137
  • method: exact

  • digits: 2

  • conf.level: 0.95

  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.715 0.6320 0.789
3 se 0.174 0.0782 0.314
4 sp 0.989 0.9403 1.000
6 diag.or 18.947 2.2899 156.776
  par(op)